# Replication Archive for
# Coppock, Alexander. Generalizing from Survey Experiments Conducted on Mechanical Turk: A Replication Approach
# Forthcoming in Political Science Research and Methods
# These functions are adaptations of the method described in Ding, Feller, and Miratrix 2015



get_pos_constant_fx <- function(Y, Z, diff_in_means_vec) {
  Z_fac <- factor(Z)
  condition_names <- levels(Z_fac)
  Z_dum <- model.matrix( ~ Z_fac)[, -1, drop = FALSE]
  Y0 <- Y - Z_dum %*% diff_in_means_vec
  
  PO_mat <-
    matrix(rep(Y0, length(condition_names)), ncol = length(condition_names))
  tau_mat <- matrix(rep(c(0, diff_in_means_vec), length(Y)),
                    ncol = length(condition_names),
                    byrow = TRUE)
  PO_mat <- PO_mat + tau_mat
  colnames(PO_mat) <- condition_names
  return(PO_mat)
}

get_dif_in_means <- function(Y, Z) {
  Z_fac <- factor(Z)
  means_vec <- tapply(X = Y, INDEX = Z, FUN = mean)
  vars_vec <- tapply(X = Y, INDEX = Z, FUN = var)
  ns_vec <- tapply(X = Y, INDEX = Z, FUN = length)
  vars_over_ns_vec <- vars_vec / ns_vec
  
  diff_in_means_vec <- means_vec[-1] - means_vec[1]
  diff_in_means_ses_vec <-
    sqrt(vars_over_ns_vec[-1] + vars_over_ns_vec[1])
  diff_in_means_ci_upper_vec <-
    diff_in_means_vec + qnorm(p = 0.9999) * diff_in_means_ses_vec
  diff_in_means_ci_lower_vec <-
    diff_in_means_vec - qnorm(p = 0.9999) * diff_in_means_ses_vec
  
  df <- data.frame(
    ATEs = diff_in_means_vec,
    SEs = diff_in_means_ses_vec,
    UIs = diff_in_means_ci_upper_vec,
    LIs = diff_in_means_ci_lower_vec
  )
  return(df)
}

## Shifted kolmogorov-smirnov statistic
## Calculate KS distance between Y0 and Y1 shifted by estimated tau.
SKS.stat_AEC <- function(x, y) {
  Y1 = x
  Y0 = y
  
  Y1.star   = Y1 - mean(Y1)
  Y0.star   = Y0 - mean(Y0)
  
  unique.points = c(Y1.star, Y0.star)
  
  Fn1 = ecdf(Y1.star)
  Fn0 = ecdf(Y0.star)
  
  difference = Fn1(unique.points) - Fn0(unique.points)
  
  return(max(abs(difference)))
  
}

get_po_obs <- function(PO_mat, Z) {
  condition_names <- colnames(PO_mat)
  
  if (!all(condition_names %in% unique(Z))) {
    stop("Not all PO names are in Z names")
  }
  
  Y_obs <- rep(NA, length(Z))
  for (i in 1:length(condition_names)) {
    Y_obs[Z == condition_names[i]] <-
      PO_mat[Z == condition_names[i], condition_names[i]]
  }
  return(Y_obs)
}

seq_vectorized <-
  Vectorize(seq.default, vectorize.args = c("from", "to"))

homogenous_fx_test <-
  function(Y,
           Z,
           data,
           sims = 100,
           ksims = 25,
           CI_method = TRUE,
           verbose = TRUE) {
    dv <- deparse(substitute(Y))
    Y_na <-  eval(substitute(Y), data)
    if (!is.numeric(Y_na)) {
      stop("The outcome variable (Y) must be numeric.")
    }
    Z_na <-  eval(substitute(Z), data)
    
    Y <- Y_na[!is.na(Y_na) & !is.na(Z_na)]
    Z <- Z_na[!is.na(Y_na) & !is.na(Z_na)]
    Z <- droplevels(factor(Z))
    
    Ys <- split(x = Y, f = Z)
    Zs <- split(x = Z, f = Z)
    
    summary_df <- NULL
    bb_df <- data.frame(sim = 1:ksims)
    
    for (i in 2:length(Ys)) {
      out <-
        homogenous_fx_binary(
          Y = c(Ys[[1]], Ys[[i]]),
          Z = c(Zs[[1]], Zs[[i]]),
          sims = sims,
          ksims = ksims,
          CI_method = CI_method,
          verbose = verbose
        )
      summary_df <- rbind(summary_df, out$summary_df)
      bb_df <- cbind(bb_df, out$bb_df)
    }
    
    summary_df$condition_names <- levels(Z)[-1]
    summary_df$dv <- dv
    return(list(summary_df = summary_df, bb_df = bb_df))
  }



homogenous_fx_binary <-
  function(Y,
           Z,
           sims = 200,
           ksims = 10,
           CI_method = TRUE,
           verbose = TRUE) {
    if (length(unique(Z)) != 2) {
      stop("This test can only compare two treatments at a time.")
    }
    
    Z <- droplevels(factor(Z))
    Ys <- split(x = Y, f = Z)
    ks_obs <- sapply(Ys[-1], FUN = SKS.stat_AEC, y = Ys[[1]])
    
    d_i_m <- get_dif_in_means(Y, Z)
    
    PO_mat <-
      get_pos_constant_fx(Y, Z, diff_in_means_vec = d_i_m$ATEs)
    condition_names <- colnames(PO_mat)
    
    ks_sim <- matrix(NA, nrow = length(ks_obs), ncol = sims)
    for (i in 1:sims) {
      #Z_sim <- simple_ra(length(Z), condition_names = condition_names)
      Z_sim <- sample(Z)
      Z_sim <- factor(Z_sim, levels = levels(Z))
      Y_sim <- get_po_obs(PO_mat = PO_mat, Z = Z_sim)
      Ys_sim <- split(x = Y_sim, f = Z_sim)
      ks_sim[, i] <-
        sapply(Ys_sim[-1], FUN = SKS.stat_AEC, y = Ys_sim[[1]])
    }
    
    df <- data.frame(
      condition_names = condition_names[-1],
      ks_obs = ks_obs,
      ps = rowMeans(ks_sim > ks_obs),
      d_i_m
    )
    
    if (CI_method == TRUE) {
      ate_seqs <- seq_vectorized(from = d_i_m$LIs,
                                 to = d_i_m$UIs,
                                 length.out = ksims)
      ps_mat <- matrix(NA, nrow = ksims, ncol = length(ks_obs))
      
      colnames(ate_seqs) <- paste0(condition_names, "_ATE")[-1]
      colnames(ps_mat) <- paste0(condition_names, "_p")[-1]
      
      if (verbose) {
        pb <- txtProgressBar(min = 0,
                             max = ksims,
                             style = 3)
      }
      for (j in 1:ksims) {
        if (verbose) {
          setTxtProgressBar(pb, j)
        }
        PO_mat <-
          get_pos_constant_fx(Y, Z, diff_in_means_vec = ate_seqs[j, ])
        ks_sim <- matrix(NA, nrow = length(ks_obs), ncol = sims)
        for (i in 1:sims) {
          Z_sim <- sample(Z)
          Y_sim <- get_po_obs(PO_mat = PO_mat, Z = Z_sim)
          Ys_sim <- split(x = Y_sim, f = Z_sim)
          ks_sim[, i] <-
            sapply(Ys_sim[-1], FUN = SKS.stat_AEC, y = Ys_sim[[1]])
        }
        ps_mat[j, ] <- rowMeans(ks_sim > ks_obs)
      }
      
      if (verbose) {
        close(pb)
      }
      bb_df <- data.frame(ate_seqs, ps_mat)
      
      df$max_pval <- apply(ps_mat, 2, max)
    } else{
      bb_df <- NULL
    }
    rownames(df) <- NULL
    return(list(summary_df = df, bb_df = bb_df))
  }